library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally)
library(mgcv) # gams
library(FNN) # Nearest neighbours
Load preprocessed data (Preprocessing/Gandal/AllRegions/20_02_27_data_preprocessing.html)
# Gandal dataset
load('./../../Gandal/AllRegions/Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame
# GO Neuronal annotations
GO_annotations = read.csv('./../../Gandal/AllRegions/Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID'=as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal'=1)
# SFARI Genes
SFARI_genes = read_csv('./../Data/SFARI_genes_01-03-2020_w_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]
# Update DE_info with SFARI and Neuronal information
genes_info = DE_info %>% dplyr::mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>%
dplyr::mutate(`gene-score`=ifelse(is.na(`gene-score`), 'Others', `gene-score`)) %>%
left_join(GO_neuronal, by='ID') %>% dplyr::mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
dplyr::mutate(gene.score=ifelse(`gene-score`=='Others' & Neuronal==1, 'Neuronal', `gene-score`),
significant=padj<0.05 & !is.na(padj))
SFARI_colour_hue = function(r) {
pal = c('#FF7631','#FFB100','#E8E328','#b3b3b3','#808080','gray','#d9d9d9')[r]
}
rm(dds, DE_info, GO_annotations, GO_neuronal, datGenes)
The proportion of over- and under-expressed genes in each SFARI Gene score is not very different to the proportion in the genes iwth Neuronal annotations nor in the rest of the genes (good, something less to worry about)
aux = genes_info %>% dplyr::select(ID, log2FoldChange, gene.score) %>%
left_join(data.frame('ID' = rownames(datExpr), 'meanExpr' = rowMeans(datExpr)), by = 'ID') %>%
dplyr::mutate(direction = ifelse(log2FoldChange>0, 'over-expressed', 'under-expressed'))
plot_data = aux %>% group_by(gene.score, direction) %>% tally(name = 'p') %>% left_join(aux %>% group_by(gene.score) %>% tally, by = 'gene.score') %>%
mutate(p = p/n, y=1)
plot_data %>% ggplot(aes(gene.score, p, fill=direction)) + geom_bar(stat='identity') +
geom_hline(yintercept = mean(plot_data$p[plot_data$direction=='under-expressed']), linetype = 'dashed', color = 'white') +
ylab('Percentage') + xlab('SFARI Gene Scores') + ggtitle('Direction of Fold-Change in genes by SFARI Score') + theme_minimal()
rm(aux)
There is a negative relation between the magnitude of the LFC and the SFARI Gene scores. This would suggest that DEA is not a good approach to identify new SFARI Genes
ggplotly(genes_info %>% ggplot(aes(x=gene.score, y=abs(log2FoldChange), fill=gene.score)) +
geom_boxplot() + scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,5,4))) +
ylab('logFoldChange Magnitude') + xlab('SFARI Gene Score') + theme_minimal() + theme(legend.position='none'))
We know that in general there is a negative relation between mean expression and lFC in genes, and we also know that there is a strong relation between SFARI Gene Scores and the mean level of expression of the genes.
This could explain the behaviour we found above, but we want to see if, once you control for the level of expression, the SFARI genes continue to have this relation to LFC or if it dissapears. (Being optimistic, perhaps the SFARI genes actually have higher LFC than genes with similar levels of expression, but we can’t see this in the original plot because of the relation between level of expression and LFC)
plot_data = genes_info %>% dplyr::select(ID, log2FoldChange, gene.score, significant) %>%
left_join(data.frame('ID' = rownames(datExpr), 'meanExpr' = rowMeans(datExpr)), by = 'ID') %>%
mutate(alpha = ifelse(gene.score == 'Others' , 0.1, ifelse(gene.score == 'Neuronal', 0.3, 0.7)))
p1 = plot_data %>% ggplot(aes(gene.score, meanExpr, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,5,4))) + theme_minimal() +
ylab('Mean Level of Expression') + xlab('SFARI Gene Score') + theme(legend.position='none')
p2 = plot_data %>% ggplot(aes(meanExpr, abs(log2FoldChange), color = gene.score)) + geom_point(alpha=plot_data$alpha) + geom_smooth(method='lm', color='#999999') +
ylab('LogFoldChange Magnitude') + xlab('Mean Expression') + scale_color_manual(values=SFARI_colour_hue(r=c(1:3,5,4))) +
theme_minimal() + theme(legend.position = 'none')
p2 = ggExtra::ggMarginal(p2, type='density',groupColour = TRUE, size=10)
grid.arrange(p2, p1, ncol=2, widths = c(0.6, 0.4))
rm(p1,p2)
We want to know what happens to the originally negative relation we found between SFARI Gene scores and lFC magnitude when we control for level of expression.
To do this, I’m going to compare each SFARI Gene with its closest non-SFARI neighbours following these steps:
Select one SFARI gene
Select its neighbours: 100 non-SFARI genes with the most similar mean level of Expression
Standardise the lFC magnitude of each of the neighbours and of the SFARI gene (using the mean and sd of the lFC magnitude of only these 101 genes)
Repeat this for each of the SFARI Genes, saving the standardised lFC magnitudes of all the SFARI genes and all the neighbours
Compare the distribution of this value between these two groups (SFARI and their neighbours)
This plot shows the general idea of steps 1, 2, and 3, selecting a random SFARI gene:
The plot on the left shows the original mean expression and lFC magnitude of the SFARI Gene and its 100 closest neighbours
The plot on the right shows the standardised lFC mangitude of the genes, and the vertical lines represent the value that is going to be recorded for each of this genes to be compared afterwards
n = 100
plot_data = genes_info %>% dplyr::select(ID, log2FoldChange, gene.score) %>%
left_join(data.frame('ID' = rownames(datExpr), 'meanExpr' = rowMeans(datExpr)), by = 'ID')
SFARI_gene = plot_data %>% filter(gene.score %in% c('1','2','3','4','5','6')) %>% sample_n(1) %>% mutate(d=0, alpha = 1)
nn = plot_data %>% filter(gene.score %in% c('Neuronal','Others')) %>% mutate(d = abs(meanExpr-SFARI_gene$meanExpr), alpha=0.5) %>% top_n(n=-n, wt = d)
plot_data = rbind(SFARI_gene, nn) %>% mutate(std_magnitude = (abs(log2FoldChange) - mean(abs(log2FoldChange)))/sd(abs(log2FoldChange)))
p1 = plot_data %>% ggplot(aes(meanExpr, abs(log2FoldChange), color = gene.score)) + geom_point(alpha = plot_data$alpha) +
scale_color_manual(values=SFARI_colour_hue(r=c(as.numeric(SFARI_gene$gene.score),5,4))) +
xlab('Mean Expression') + ylab('Log2 Fold Change Magnitude') + theme_minimal() + theme(legend.position='none')
p2 = plot_data %>% ggplot(aes(meanExpr, std_magnitude, color = gene.score)) + geom_point(alpha = plot_data$alpha) +
geom_hline(aes(yintercept = mean(std_magnitude)), linetype = 'dashed', color = '#999999') +
scale_color_manual(values=SFARI_colour_hue(r=c(as.numeric(SFARI_gene$gene.score),5,4))) +
geom_segment(aes(x=SFARI_gene$meanExpr, y=mean(std_magnitude), xend = SFARI_gene$meanExpr, yend = std_magnitude[1]),
alpha = 0.5, color = SFARI_colour_hue(r=1:8)[as.numeric(SFARI_gene$gene.score)]) +
xlab('Mean Expression') + ylab('Standardised LFC Magnitude') + theme_minimal() + theme(legend.position='none')
for(i in 1:15){
random_sample = plot_data %>% filter(gene.score != SFARI_gene$gene.score) %>% sample_n(1)
p2 = p2 + geom_segment(x=random_sample$meanExpr, xend = random_sample$meanExpr, y=mean(plot_data$std_magnitude), yend = random_sample$std_magnitude,
alpha = 0.5, color = 'gray')
}
grid.arrange(p1, p2, ncol=2, top='Comparing SFARI Genes with their n closest neighbours by Mean Expression')
cat(paste0('SFARI gene\'s standardised distance to its neigbours\'s LFC magnitude: ', round(plot_data$std_magnitude[1],4)))
## SFARI gene's standardised distance to its neigbours's LFC magnitude: -1.1426
rm(p1, p2, SFARI_gene, nn, random_sample, i)
As steps 4, and 5, say, we repeat this for all of the SFARI Genes, recording their standardised mangnitude as well as the ones from their neighbours so we can study them all together
Even when controlling for the relation between Mean Expression and LFC by comparing each SFARI Gene only with neighbouring genes, we see the same results as before!
Neuronal genes have higher magnitudes of lFC than non-SFARI, non-neuronal genes consistently (makes sense)
The higher the SFARI Gene score, the lower the LFC (even with genes with the same level of expression!)
Only SFARI Genes with score 6 have LFC magnitudes higher than their neighbours
get_std_lfc_magnitudes = function(data_info, SFARI_score, n){
SFARI_genes = data_info %>% filter(gene.score == as.character(SFARI_score))
std_magnitudes = data.frame(gene.score = as.character(), std_magnitude = as.numeric)
for(i in 1:nrow(SFARI_genes)){
SFARI_gene = SFARI_genes[i,]
nn = data_info %>% filter(gene.score %in% c('Neuronal','Others')) %>% mutate(d = abs(meanExpr-SFARI_gene$meanExpr)) %>% top_n(n=-n, wt = d) %>% dplyr::select(-d)
iter_data = rbind(SFARI_gene, nn) %>% mutate(std_magnitude = (abs(log2FoldChange) - mean(abs(log2FoldChange)))/sd(abs(log2FoldChange))) %>%
dplyr::select(gene.score, std_magnitude)
std_magnitudes = rbind(std_magnitudes, iter_data)
}
return(std_magnitudes)
}
data_info = genes_info %>% dplyr::select(ID, log2FoldChange, gene.score) %>%
left_join(data.frame('ID' = rownames(datExpr), 'meanExpr' = rowMeans(datExpr)), by = 'ID')
std_magnitudes_score_1 = get_std_lfc_magnitudes(data_info, 1, n)
std_magnitudes_score_2 = get_std_lfc_magnitudes(data_info, 2, n)
std_magnitudes_score_3 = get_std_lfc_magnitudes(data_info, 3, n)
p1 = std_magnitudes_score_1 %>% ggplot(aes(gene.score, std_magnitude, fill = gene.score)) + geom_boxplot() + xlab('') + ylab('Standardised LFC Magnitude') +
scale_fill_manual(values=SFARI_colour_hue(r=c(1,5,4))) + theme_minimal() + theme(legend.position = 'none')
p2 = std_magnitudes_score_2 %>% ggplot(aes(gene.score, std_magnitude, fill = gene.score)) + geom_boxplot() + xlab('') + ylab('') +
scale_fill_manual(values=SFARI_colour_hue(r=c(2,5,4))) + theme_minimal() + theme(legend.position = 'none')
p3 = std_magnitudes_score_3 %>% ggplot(aes(gene.score, std_magnitude, fill = gene.score)) + geom_boxplot() + xlab('') + ylab('') +
scale_fill_manual(values=SFARI_colour_hue(r=c(3,5,4))) + theme_minimal() + theme(legend.position = 'none')
grid.arrange(p1,p2,p3,nrow=1, top = 'Comparison of LFC Magnitude of SFARI gens and their closest neighbours by Mean Expression')
rm(p1,p2,p3)#,std_magnitudes_score_1, std_magnitudes_score_2, std_magnitudes_score_3)
Hypothesis test to see if SFARI Genes by score and their neighbours have the same mean:
Null hypothesis: Differences in means is equal to zero
get_t_test_vals = function(std_magnitudes_score, score){
t_test_Neuronal = t.test(std_magnitude ~ gene.score, data = std_magnitudes_score[std_magnitudes_score$gene.score != 'Others',], var.equal=FALSE)
t_test_others = t.test(std_magnitude ~ gene.score, data = std_magnitudes_score[std_magnitudes_score$gene.score != 'Neuronal',], var.equal=FALSE)
return(rbind(c(score, 'Neuronal', t_test_Neuronal$estimate[1][[1]], t_test_Neuronal$estimate[2][[1]], t_test_Neuronal$p.value),
c(score, 'Others', t_test_others$estimate[1][[1]], t_test_others$estimate[2][[1]], t_test_others$p.value)))
}
results_1 = get_t_test_vals(std_magnitudes_score_1, 1)
results_2 = get_t_test_vals(std_magnitudes_score_2, 2)
results_3 = get_t_test_vals(std_magnitudes_score_3, 3)
t_test_df = rbind(results_1, results_2, results_3) %>% data.frame %>%
dplyr::rename('SFARI_score' = X1, 'Comparison' = X2, 'mean_group_1' = X3, 'mean_group_2' = X4, 'p_val' = X5) %>%
mutate(BH_p_val = p.adjust(as.numeric(as.character(p_val)), method = 'BH')) %>%
mutate(same_mean = ifelse(BH_p_val<0.05, 'No', 'Possible'))